Loading the Dataset

library(readr)
library(MVA)
## Loading required package: HSAUR2
## Loading required package: tools
library(HSAUR2)
library(SciViews)
library(scatterplot3d)
library(car)
## Loading required package: carData
library(lattice)
library(GGally)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(ggplot2)
library(ggridges)
library(ggvis)
## 
## Attaching package: 'ggvis'
## The following object is masked from 'package:ggplot2':
## 
##     resolution
library(ggthemes)
library(cowplot)
## 
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggthemes':
## 
##     theme_map
library(gapminder)
library(gganimate)
## No renderer backend detected. gganimate will default to writing frames to separate files
## Consider installing:
## - the `gifski` package for gif output
## - the `av` package for video output
## and restarting the R session
## 
## Attaching package: 'gganimate'
## The following object is masked from 'package:ggvis':
## 
##     view_static
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter()     masks stats::filter()
## ✖ dplyr::lag()        masks stats::lag()
## ✖ dplyr::recode()     masks car::recode()
## ✖ ggvis::resolution() masks ggplot2::resolution()
## ✖ purrr::some()       masks car::some()
## ✖ lubridate::stamp()  masks cowplot::stamp()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(grid)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
library(RColorBrewer)
library(Hotelling)
## Loading required package: corpcor
## 
## Attaching package: 'Hotelling'
## 
## The following object is masked from 'package:dplyr':
## 
##     summarise
library(stats)
library(biotools)
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## ---
## biotools version 4.2
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(FactoMineR)
library(ggfortify)
library(psych)
## 
## Attaching package: 'psych'
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## 
## The following object is masked from 'package:car':
## 
##     logit
library(corrplot)
## corrplot 0.92 loaded
library(devtools)
## Loading required package: usethis
library(cluster)
library(magrittr)
## 
## Attaching package: 'magrittr'
## 
## The following object is masked from 'package:purrr':
## 
##     set_names
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
library(NbClust)
library(MASS)
library(gvlma)
library(leaps)
library(relaimpo)
## Loading required package: boot
## 
## Attaching package: 'boot'
## 
## The following object is masked from 'package:psych':
## 
##     logit
## 
## The following object is masked from 'package:lattice':
## 
##     melanoma
## 
## The following object is masked from 'package:car':
## 
##     logit
## 
## Loading required package: survey
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## The following object is masked from 'package:ggvis':
## 
##     band
## 
## Loading required package: survival
## 
## Attaching package: 'survival'
## 
## The following object is masked from 'package:boot':
## 
##     aml
## 
## 
## Attaching package: 'survey'
## 
## The following object is masked from 'package:graphics':
## 
##     dotchart
## 
## Loading required package: mitools
## This is the global version of package relaimpo.
## 
## If you are a non-US user, a version with the interesting additional metric pmvd is available
## 
## from Ulrike Groempings web site at prof.beuth-hochschule.de/groemping.
library(e1071)
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## 
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(memisc)
## 
## Attaching package: 'memisc'
## 
## The following object is masked from 'package:Matrix':
## 
##     as.array
## 
## The following object is masked from 'package:magrittr':
## 
##     %$%
## 
## The following objects are masked from 'package:lubridate':
## 
##     as.interval, is.interval
## 
## The following object is masked from 'package:purrr':
## 
##     %@%
## 
## The following object is masked from 'package:tibble':
## 
##     view
## 
## The following objects are masked from 'package:dplyr':
## 
##     collect, recode, rename, syms
## 
## The following object is masked from 'package:ggplot2':
## 
##     syms
## 
## The following object is masked from 'package:car':
## 
##     recode
## 
## The following objects are masked from 'package:stats':
## 
##     contr.sum, contr.treatment, contrasts
## 
## The following object is masked from 'package:base':
## 
##     as.array
library(ROCR)
library(klaR)

# Load the cleaned college dataset; the first CSV column holds the college
# names and is used as row names.
# NOTE(review): absolute, machine-specific path — consider a relative path
# (or here::here()) so the script is portable across machines.
clg <- read.csv("/Users/ajayvishnu/Desktop/RUTGERS/Spring_2023/Multivariate Analysis/Datasets/College_Data_cleaned.csv", row.names = 1)

# NOTE(review): attach() is discouraged — it can silently mask objects on the
# search path. The rest of the script appears to reference columns via clg$...
# or data = clg, so this attach looks unnecessary; confirm before removing.
attach(clg)

Analysing the Data

# Inspect the structure: 30 colleges, 19 integer/numeric variables.
str(clg)
## 'data.frame':    30 obs. of  19 variables:
##  $ Private      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Apps         : int  15712 4269 8579 48094 1879 19315 15698 5785 5095 14752 ...
##  $ Accept       : int  11719 2594 5561 26330 1216 10344 10775 2690 4491 9572 ...
##  $ Acceptance_70: int  1 0 0 0 0 0 0 0 1 0 ...
##  $ Enroll       : int  4277 985 3681 4520 483 3450 2478 499 2400 5329 ...
##  $ Top10perc    : int  29 27 25 36 27 48 85 26 27 48 ...
##  $ Top25perc    : int  53 57 50 79 62 93 100 62 53 85 ...
##  $ F.Undergrad  : int  18511 6476 17880 21401 3311 28938 12677 4005 13894 30017 ...
##  $ P.Undergrad  : int  604 2592 1673 3712 1646 2025 864 1886 8374 5189 ...
##  $ Outstate     : int  10260 8594 6994 7410 8832 10645 12024 7410 6857 5130 ...
##  $ Room.Board   : int  3176 4408 3384 4748 5376 4060 5302 4748 3975 3309 ...
##  $ Books        : int  740 494 700 690 700 512 790 690 858 650 ...
##  $ Personal     : int  2200 2768 2681 2009 1850 2394 1818 2009 3093 3140 ...
##  $ PhD          : int  85 82 88 90 92 77 96 90 89 91 ...
##  $ Terminal     : int  89 88 94 95 98 96 96 95 93 99 ...
##  $ S.F.Ratio    : num  13.8 18.4 13.7 19.5 13.5 18.1 16.1 17.4 12.8 19.7 ...
##  $ perc.alumni  : int  20 6 17 19 19 19 11 16 9 11 ...
##  $ Expend       : int  8944 7618 9657 10474 12529 8992 15934 11878 9275 7837 ...
##  $ Grad.Rate    : int  73 55 57 77 72 63 66 58 37 65 ...

Data Dictionary

  • Group: A categorical variable indicating whether the college is public or private. 1 indicates a private college and 0 indicates a public college.
  • Private: A binary variable indicating whether the college is private or not. 1 indicates a private college and 0 indicates a public college.
  • Apps: The total number of applications received by the college.
  • Accept: The total number of applications accepted by the college.
  • Acceptance_70: The acceptance rate of the college. (Acceptance rate = Accept/Apps, >70% - 1, else - 0)
  • Enroll: The total number of students enrolled in the college.
  • Top10perc: The percentage of new students who ranked in the top 10% of their high school class.
  • Top25perc: The percentage of new students who ranked in the top 25% of their high school class.
  • F.Undergrad: The total number of full-time undergraduate students enrolled in the college.
  • P.Undergrad: The total number of part-time undergraduate students enrolled in the college.
  • Outstate: The out-of-state tuition fee for the college.
  • Room.Board: The cost of room and board for the college.
  • Books: The estimated cost of books and supplies for a year of study.
  • Personal: The estimated personal expenses for a year of study.
  • PhD: The percentage of faculty members who hold a PhD degree.
  • Terminal: The percentage of faculty members who hold a terminal degree in their field of study.
  • S.F.Ratio: The student-to-faculty ratio for the college.
  • perc.alumni: The percentage of alumni who donate to the college.
  • Expend: The instructional expenditure per student at the college.
  • Grad.Rate: The graduation rate of the college as a percentage.

Correlation Test

# Color-coded upper-triangle view of the pairwise correlation matrix.
clg %>% cor() %>% corrplot(type = "upper", method = "color")

  • The correlation matrix shows us that there is correlation between the columns.
  • Hence, Principal Component Analysis (PCA) can be used to reduce the number of columns for the analysis.

Principal Component Analysis (PCA)

###PCA

# PCA on all columns except the Private indicator (column 1), with variables
# standardized to unit variance.
# FIX: the prcomp() argument is `scale.` — `scale=` only worked via partial
# argument matching, which is fragile; spell it out.
clg_pca <- prcomp(clg[, -1], scale. = TRUE)

Scree diagram

# Scree plot: percentage of variance explained per principal component.
fviz_eig(clg_pca, addlabels = TRUE)

  • The scree diagram shows us an elbow at 3 principal components.
  • So, this states that the ideal number of components to use is 3.

Biplot

# Variable correlation circle: arrow length reflects how much variance of the
# variable is represented; the angle between arrows reflects correlation.
# Color encodes cos2 (quality of representation on these two components).
fviz_pca_var(clg_pca,
             repel = TRUE,
             col.var = "cos2",
             gradient.cols = c("#FFCC00", "#CC9933", "#660033", "#330033"))

  • The distance between points in a biplot reflects generalised distance between the units.
  • Length of the vector reflects the variance of the variable.
  • Correlation of the variables is reflected by the angle between them: the smaller the angle, the greater the correlation.
  • We can see that the correlations are as expected.
  • Number of Applications and Acceptances are highly correlated, PhD and Terminal are highly correlated, Out State and Room.Board are highly correlated.

Plotting colleges

# Re-run the PCA via FactoMineR (plots suppressed) and plot each college on
# the first two principal components.
res.pca <- PCA(clg[,-1], graph = FALSE)
fviz_pca_ind(res.pca)

  • The colleges have been plotted based on their PCA values.
  • It can be observed that all the colleges on the left of Y axis are Private colleges and the ones on the right are Public universities.
  • It is NOT based on the geographic location.
  • Imposing the biplot on this, gives better understanding of the visual.

Colleges and the biplot

# Biplot: overlay the variable arrows (orange) on the college scores.
# FIX: removed the stray trailing comma, which passed an empty argument to
# fviz_pca_biplot().
fviz_pca_biplot(res.pca, repel = TRUE,
                col.var = "#FC4E07")

  • It can be seen that the public universities clearly have higher number of application, acceptances, and enrollments compared to private institutions.
  • Moreover, the colleges closer to the X-axis, such as Rutgers, New Brunswick, have higher numbers.
  • The personal expenses and the expenses for books are also higher towards the public universities.
  • The number of full time and part time undergrad students are more towards public universities.
  • Full time undergrad are more for Rutgers, New Brunswick and Part time undergrad are more for colleges Utah and Texas, Austin universities.
  • The university expenditure per student is the highest for CMU.

Clustering

Distance matrix

# Standardized Euclidean distance matrix between colleges.
# NOTE(review): clg_dist is never used later in the visible script — either
# use it (e.g. for hclust below) or drop this line.
clg_dist <- get_dist(clg[,-1], stand = TRUE, method = "euclidean")

Kmeans optimal clusters

# Standardize all numeric columns (drop the Private indicator in column 1);
# matstd_clg is reused by every clustering step below.
matstd_clg <- scale(clg[,-1])
#scaled
# Gap-statistic search for the optimal number of k-means clusters.
fviz_nbclust(matstd_clg, kmeans, method = "gap_stat")

  • This method shows that the optimal number of clusters is 1.
  • This doesn’t look right, so let us test this with another optimal cluster test.

Code for running without error

# Patched drop-in copy of factoextra::fviz_nbclust() so that the NbClust
# result object can be visualized without error. Interface is identical to
# the packaged version.
#
# x            matrix/data.frame of data, or an NbClust() result (a list
#              containing a Best.nc element).
# FUNcluster   clustering function (kmeans, pam, hcut, ...); required unless
#              x is an NbClust result.
# method       criterion to plot: "silhouette", "wss" or "gap_stat".
# diss         optional precomputed dissimilarity; defaults to stats::dist(x).
# k.max        largest number of clusters to evaluate (>= 2).
# nboot        bootstrap samples for the gap statistic.
# Returns a ggplot object (or prints Best.nc for the numeric NbClust case).
fviz_nbclust <- function (x, FUNcluster = NULL, method = c("silhouette", "wss", 
                                                           "gap_stat"), diss = NULL, k.max = 10, nboot = 100, verbose = interactive(), 
                          barfill = "steelblue", barcolor = "steelblue", linecolor = "steelblue", 
                          print.summary = TRUE, ...) 
{
  # Fixed seed so gap-statistic bootstrapping is reproducible.
  set.seed(123)
  if (k.max < 2) 
    stop("k.max must be >= 2")  # FIX: message typo ("must bet > = 2")
  method <- match.arg(method)   # FIX: use <- rather than = for assignment
  # FIX: scalar condition, so use && (short-circuit) instead of &.
  if (!inherits(x, c("data.frame", "matrix")) && !("Best.nc" %in% 
                                                   names(x))) 
    stop("x should be an object of class matrix/data.frame or ", 
         "an object created by the function NbClust() [NbClust package].")
  if (inherits(x, "list") && "Best.nc" %in% names(x)) {
    # x is an NbClust() result: delegate to the summary/bar-plot helper.
    best_nc <- x$Best.nc
    # FIX: inherits() instead of the class(x) == "cls" anti-pattern.
    if (inherits(best_nc, "numeric")) 
      print(best_nc)
    else if (inherits(best_nc, "matrix")) 
      .viz_NbClust(x, print.summary, barfill, barcolor)
  }
  else if (is.null(FUNcluster)) 
    stop("The argument FUNcluster is required. ", "Possible values are kmeans, pam, hcut, clara, ...")
  else if (!is.function(FUNcluster)) {
    stop("The argument FUNcluster should be a function. ", 
         "Check if you're not overriding the specified function name somewhere.")
  }
  else if (method %in% c("silhouette", "wss")) {
    if (is.data.frame(x)) 
      x <- as.matrix(x)
    if (is.null(diss)) 
      diss <- stats::dist(x)
    v <- rep(0, k.max)
    if (method == "silhouette") {
      # Silhouette is undefined for k = 1, hence the loop starts at 2.
      for (i in 2:k.max) {
        clust <- FUNcluster(x, i, ...)
        v[i] <- .get_ave_sil_width(diss, clust$cluster)
      }
    }
    else if (method == "wss") {
      for (i in seq_len(k.max)) {
        clust <- FUNcluster(x, i, ...)
        v[i] <- .get_withinSS(diss, clust$cluster)
      }
    }
    df <- data.frame(clusters = as.factor(seq_len(k.max)), y = v, 
                     stringsAsFactors = TRUE)
    ylab <- "Total Within Sum of Square"
    if (method == "silhouette") 
      ylab <- "Average silhouette width"
    p <- ggpubr::ggline(df, x = "clusters", y = "y", group = 1, 
                        color = linecolor, ylab = ylab, xlab = "Number of clusters k", 
                        main = "Optimal number of clusters")
    if (method == "silhouette") 
      p <- p + geom_vline(xintercept = which.max(v), linetype = 2, 
                          color = linecolor)
    return(p)
  }
  else if (method == "gap_stat") {
    extra_args <- list(...)
    gap_stat <- cluster::clusGap(x, FUNcluster, K.max = k.max, 
                                 B = nboot, verbose = verbose, ...)
    # Honor a user-supplied maxSE rule, otherwise use firstSEmax.
    if (!is.null(extra_args$maxSE)) 
      maxSE <- extra_args$maxSE
    else maxSE <- list(method = "firstSEmax", SE.factor = 1)
    p <- fviz_gap_stat(gap_stat, linecolor = linecolor, 
                       maxSE = maxSE)
    return(p)
  }
}

# Helper for fviz_nbclust(): summarize and plot the votes in an NbClust
# Best.nc matrix (one proposed k per validity index).
#
# x              NbClust() result list containing Best.nc.
# print.summary  if TRUE, cat() a textual tally of the votes.
# Returns the ggpubr bar plot (matrix case) or prints Best.nc (numeric case).
.viz_NbClust <- function (x, print.summary = TRUE, barfill = "steelblue", 
                          barcolor = "steelblue") 
{
  best_nc <- x$Best.nc
  # inherits() instead of the class(x) == "cls" anti-pattern.
  if (inherits(best_nc, "numeric"))
    print(best_nc)
  else if (inherits(best_nc, "matrix")) {
    best_nc <- as.data.frame(t(best_nc), stringsAsFactors = TRUE)
    best_nc$Number_clusters <- as.factor(best_nc$Number_clusters)
    # BUG FIX: ss was previously computed only inside the print.summary
    # branch, so calling with print.summary = FALSE failed with
    # "object 'ss' not found". It is needed for the plot regardless.
    ss <- summary(best_nc$Number_clusters)
    if (print.summary) {
      cat("Among all indices: \n===================\n")
      for (i in seq_along(ss)) {
        cat("*", ss[i], "proposed ", names(ss)[i], 
            "as the best number of clusters\n")
      }
      cat("\nConclusion\n=========================\n")
      cat("* According to the majority rule, the best number of clusters is ", 
          names(which.max(ss)), ".\n\n")
    }
    df <- data.frame(Number_clusters = names(ss), freq = ss, 
                     stringsAsFactors = TRUE)
    p <- ggpubr::ggbarplot(df, x = "Number_clusters", 
                           y = "freq", fill = barfill, color = barcolor) + 
      labs(x = "Number of clusters k", y = "Frequency among all indices", 
           title = paste0("Optimal number of clusters - k = ", 
                          names(which.max(ss))))
    return(p)
  }
}

Another test for optimal clusters

# NbClust: vote across many cluster-validity indices (k from 2 to 10,
# complete linkage, Euclidean distance, on standardized data).
res.nbclust <- clg[,-1] %>% scale() %>% NbClust(distance = "euclidean", min.nc = 2, max.nc = 10, method = "complete", index ="all") 
## Warning in pf(beale, pp, df2): NaNs produced

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 6 proposed 2 as the best number of clusters 
## * 8 proposed 3 as the best number of clusters 
## * 1 proposed 4 as the best number of clusters 
## * 1 proposed 7 as the best number of clusters 
## * 1 proposed 9 as the best number of clusters 
## * 6 proposed 10 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************
# Visualize the NbClust votes using the patched fviz_nbclust() defined above.
fviz_nbclust(res.nbclust, ggtheme = theme_minimal())
## Among all indices: 
## ===================
## * 2 proposed  0 as the best number of clusters
## * 1 proposed  1 as the best number of clusters
## * 6 proposed  2 as the best number of clusters
## * 8 proposed  3 as the best number of clusters
## * 1 proposed  4 as the best number of clusters
## * 1 proposed  7 as the best number of clusters
## * 1 proposed  9 as the best number of clusters
## * 6 proposed  10 as the best number of clusters
## 
## Conclusion
## =========================
## * According to the majority rule, the best number of clusters is  3 .

  • This test shows the optimal number of clusters is 3.
  • Visualizing the clusters will give us a better understanding of how they are classified.
# Reproducible k-means on the standardized data with k = 3 and
# 25 random starts to avoid poor local optima.
set.seed(123)
km.res <- kmeans(matstd_clg, centers = 3, nstart = 25)
# Show the resulting clusters with convex hulls around each group.
fviz_cluster(km.res,
             data = matstd_clg,
             ellipse.type = "convex",
             palette = "jco",
             ggtheme = theme_minimal())

  • In the next step we check for any correlation and then infer from the clustering visuals.
  • We do this by using the scaling function.
# PAM (k-medoids) with k = 3 on the standardized data — a more
# outlier-robust alternative to k-means.
pam.res <- pam(matstd_clg, 3)
fviz_cluster(pam.res)

  • We observe that all the universities in Cluster 3 (blue) are Private Universities.
  • Except for two of the Private universities, all the rest fell under Cluster 3.
  • The Public Universities are placed in the remaining two clusters.
  • The ones in Cluster 1 (Red) have very high number of applications, acceptances and enrollments (in thousands).
  • The Cluster 2 (green) is also of Public universities but the ones with fewer applications, acceptances, and enrollments (in hundreds).
# Hierarchical clustering with Ward's linkage on Euclidean distances.
# FIX: matstd_clg is already standardized, so the extra scale() step was
# redundant double scaling and has been removed.
res.hc <- matstd_clg %>% dist(method = "euclidean") %>%
  hclust(method = "ward.D2")

# FIX: the comment said "four groups" while k = 3, and 4 colors were supplied
# for 3 clusters (which triggered the "color vector longer than number of
# clusters" warning).
fviz_dend(res.hc, k = 3, # Cut into three groups
          cex = 0.5, # label size
          k_colors = c("#2E9FDF", "#00AFBB", "#E7B800"),
          color_labels_by_k = TRUE, # color labels by groups
          rect = TRUE # Add rectangle around groups
          )
## Warning in get_col(col, k): Length of color vector was longer than the number
## of clusters - first k elements are used
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

  • In this clustering, all the universities in teal colour are Private universities.
  • The universities in Blue are all Public and the ones in Yellow cluster are a mixture of public and private universities.
  • Even using this method, we are able to differentiate between public and private to some extent.
  • The Blue and Yellow differentiation is mostly focused on the number of applications, acceptances, and enrollments. The ones in blue have much higher numbers when compared to the ones in yellow.

Exploratory Factor Analysis (EFA)

EFA

# Principal-components-based EFA with 5 varimax-rotated components.
# NOTE(review): the commentary below interprets only RC1, RC3 and RC4.
fit.pc <- principal(clg[,-1], nfactors=5, rotate="varimax")

EFA Scores for each college

# Rotated component scores for each college.
fit.pc$scores
##                                               RC1         RC3         RC4
## Virginia Tech                         1.485468071  0.21131139 -0.21998370
## Maryland, Baltimore County           -0.481006334  0.76030425 -0.31548733
## University of Kansas                  0.177350813  1.40427674  0.02317481
## Rutgers, New Brunswick                3.133346443 -0.54184707  0.10004650
## New Jersey Institute of Technology   -0.698565381  0.57433793  0.29395115
## Penn State                            1.263617108  0.06342282  0.84411017
## University of California, Irvine      0.283893077  0.27706042  2.08221597
## Rutgers ,  Newark                    -0.677051736  0.74194350  0.27258849
## University of Utah                   -0.356647575  3.34936020 -1.18308501
## University of Texas, Austin           1.271969226  1.44186502  0.74986789
## SUNY College at Geneseo               0.008424103 -1.52037815  1.98538731
## University of Wisconsin at Madison    1.516380130  0.26320285 -0.09742845
## St. Mary's College of Maryland       -1.124429195  0.14350357  1.14223565
## Washington State University           0.185200681  0.95130029  0.31005570
## Lincoln University                   -1.161856724  0.18943920 -0.60705986
## Montclair State University           -0.377018822 -0.01890837 -0.20013527
## University of Massachusetts, Amherst  1.576944277 -0.15158429 -1.40007992
## Mount Vernon Nazarene College        -0.364193850 -1.23343747 -0.82760015
## Carnegie Mellon University           -0.224847119 -0.25735884  1.07996340
## Campbellsville College               -1.032269951 -0.40964800  0.33895063
## Hillsdale College                    -0.206749033 -1.13226255  0.14398325
## Austin College                       -0.377970174 -0.66656689  0.56118337
## Fresno Pacific College               -1.061532078 -0.09982983  0.80092587
## Roger Williams University             0.162940212 -1.47084476 -2.46329998
## Point Park College                   -0.114030259 -0.81216742 -1.83109281
## Bethany College                      -0.819570894  0.05679587 -0.65195750
## Caldwell College                     -0.859520115 -0.02187551 -0.34922095
## Southern California College          -0.792310218 -0.03837150 -0.97211902
## University of Evansville              0.021108340 -1.20027859  0.28912388
## Cornell College                      -0.357073020 -0.85276482  0.10078590
##                                              RC2         RC5
## Virginia Tech                         0.93933366 -1.37146513
## Maryland, Baltimore County           -1.04433771  0.69037664
## University of Kansas                  0.02120010 -0.49041703
## Rutgers, New Brunswick               -1.04843438  0.72216022
## New Jersey Institute of Technology   -0.05933922  1.65202697
## Penn State                           -0.32614786 -0.30181199
## University of California, Irvine     -0.18008499  1.01341401
## Rutgers ,  Newark                    -0.74251293  1.09254223
## University of Utah                    0.52779424 -0.09370849
## University of Texas, Austin          -1.02061303 -0.72998530
## SUNY College at Geneseo              -1.37428272 -0.58696470
## University of Wisconsin at Madison    1.12649462 -0.17115117
## St. Mary's College of Maryland       -0.13265786  0.40476349
## Washington State University           0.67877810 -0.88696370
## Lincoln University                   -0.60445947 -0.01578063
## Montclair State University           -2.23511230  0.88218839
## University of Massachusetts, Amherst  0.24209556  0.12073221
## Mount Vernon Nazarene College        -1.01965290 -1.24747394
## Carnegie Mellon University            2.18335679  2.38731938
## Campbellsville College               -1.37511652 -1.29394799
## Hillsdale College                     1.25164830  0.35262799
## Austin College                        1.23320379  0.05805328
## Fresno Pacific College                0.70563964 -1.79219884
## Roger Williams University            -0.17658194  1.05281187
## Point Park College                    0.04786513  0.68789793
## Bethany College                       0.98875049 -1.46132669
## Caldwell College                     -0.19445128  1.06430250
## Southern California College          -0.33594369 -0.65912083
## University of Evansville              0.43124829 -0.68883908
## Cornell College                       1.49232006 -0.39006159
  • From observation we can see that all Private Universities had negative RC1 and RC4.
  • Except a few public universities, rest all have positive RC1 and RC4.
  • We can further look at the fit.pc diagram to understand a better classification based on these values.

Parallel Scree plot

# Parallel analysis scree plot: observed eigenvalues vs resampled data.
fa.parallel(clg[,-1])

## Parallel analysis suggests that the number of factors =  2  and the number of components =  2
  • The scree plot shows that the ideal factors for both PCA and EFA is 3.
  • This is obtained from the elbow from both the lines.
# Diagram showing which variables load on which rotated components.
fa.diagram(fit.pc)

  • As the scree plot suggests, we take the first three components - RC1, RC3, RC4.
  • The diagram shows that Applications, Acceptances, Enrollments, Graduation Rate all come under RC1.
  • We can also observe that all the Private universities have a negative RC1. A few of Public universities also have negative value.
  • In similar lines, Rooms, Personal expenses, and Part time undergrad is related to RC3.
  • Here we can observe a clearer classification, as all Private universities have a negative RC3 and all Public universities have a positive RC3.
  • Using the EFA, we were clearly able to differentiate between the university type.
# Very Simple Structure criterion — a cross-check on the number of factors.
summary(vss(clg[,-1]))
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.

## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.

## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected.  Examine the results carefully
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.

## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected.  Examine the results carefully

## 
## Very Simple Structure
## VSS complexity 1 achieves a maximimum of 0.68  with  2  factors
## VSS complexity 2 achieves a maximimum of 0.85  with  2  factors
## 
## The Velicer MAP criterion achieves a minimum of 0.11  with  4  factors
## 
  • The VSS plot shows that from 3 factors the line is stable.
  • This implies that the ideal number of factors is 3, justifies the result we got from scree plot too.

Multiple Regression

Logistic Regression

# Re-inspect structure before recoding the 0/1 indicators into factors.
str(clg)
## 'data.frame':    30 obs. of  19 variables:
##  $ Private      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Apps         : int  15712 4269 8579 48094 1879 19315 15698 5785 5095 14752 ...
##  $ Accept       : int  11719 2594 5561 26330 1216 10344 10775 2690 4491 9572 ...
##  $ Acceptance_70: int  1 0 0 0 0 0 0 0 1 0 ...
##  $ Enroll       : int  4277 985 3681 4520 483 3450 2478 499 2400 5329 ...
##  $ Top10perc    : int  29 27 25 36 27 48 85 26 27 48 ...
##  $ Top25perc    : int  53 57 50 79 62 93 100 62 53 85 ...
##  $ F.Undergrad  : int  18511 6476 17880 21401 3311 28938 12677 4005 13894 30017 ...
##  $ P.Undergrad  : int  604 2592 1673 3712 1646 2025 864 1886 8374 5189 ...
##  $ Outstate     : int  10260 8594 6994 7410 8832 10645 12024 7410 6857 5130 ...
##  $ Room.Board   : int  3176 4408 3384 4748 5376 4060 5302 4748 3975 3309 ...
##  $ Books        : int  740 494 700 690 700 512 790 690 858 650 ...
##  $ Personal     : int  2200 2768 2681 2009 1850 2394 1818 2009 3093 3140 ...
##  $ PhD          : int  85 82 88 90 92 77 96 90 89 91 ...
##  $ Terminal     : int  89 88 94 95 98 96 96 95 93 99 ...
##  $ S.F.Ratio    : num  13.8 18.4 13.7 19.5 13.5 18.1 16.1 17.4 12.8 19.7 ...
##  $ perc.alumni  : int  20 6 17 19 19 19 11 16 9 11 ...
##  $ Expend       : int  8944 7618 9657 10474 12529 8992 15934 11878 9275 7837 ...
##  $ Grad.Rate    : int  73 55 57 77 72 63 66 58 37 65 ...
# Recode the 0/1 indicator columns into labeled factors in one step each.
# Private: 1 -> "Private", 0 -> "Public" (levels ordered Private, Public,
# matching alphabetical as.factor() ordering).
clg$Private <- factor(clg$Private, levels = c(1, 0),
                      labels = c("Private", "Public"))

# Acceptance_70: 0 -> "Acceptance <= 70%", 1 -> "Acceptance > 70%".
clg$Acceptance_70 <- factor(clg$Acceptance_70, levels = c(0, 1),
                            labels = c("Acceptance <= 70%", "Acceptance > 70%"))
str(clg)
## 'data.frame':    30 obs. of  19 variables:
##  $ Private      : Factor w/ 2 levels "Private","Public": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Apps         : int  15712 4269 8579 48094 1879 19315 15698 5785 5095 14752 ...
##  $ Accept       : int  11719 2594 5561 26330 1216 10344 10775 2690 4491 9572 ...
##  $ Acceptance_70: Factor w/ 2 levels "Acceptance <= 70%",..: 2 1 1 1 1 1 1 1 2 1 ...
##  $ Enroll       : int  4277 985 3681 4520 483 3450 2478 499 2400 5329 ...
##  $ Top10perc    : int  29 27 25 36 27 48 85 26 27 48 ...
##  $ Top25perc    : int  53 57 50 79 62 93 100 62 53 85 ...
##  $ F.Undergrad  : int  18511 6476 17880 21401 3311 28938 12677 4005 13894 30017 ...
##  $ P.Undergrad  : int  604 2592 1673 3712 1646 2025 864 1886 8374 5189 ...
##  $ Outstate     : int  10260 8594 6994 7410 8832 10645 12024 7410 6857 5130 ...
##  $ Room.Board   : int  3176 4408 3384 4748 5376 4060 5302 4748 3975 3309 ...
##  $ Books        : int  740 494 700 690 700 512 790 690 858 650 ...
##  $ Personal     : int  2200 2768 2681 2009 1850 2394 1818 2009 3093 3140 ...
##  $ PhD          : int  85 82 88 90 92 77 96 90 89 91 ...
##  $ Terminal     : int  89 88 94 95 98 96 96 95 93 99 ...
##  $ S.F.Ratio    : num  13.8 18.4 13.7 19.5 13.5 18.1 16.1 17.4 12.8 19.7 ...
##  $ perc.alumni  : int  20 6 17 19 19 19 11 16 9 11 ...
##  $ Expend       : int  8944 7618 9657 10474 12529 8992 15934 11878 9275 7837 ...
##  $ Grad.Rate    : int  73 55 57 77 72 63 66 58 37 65 ...
# Contingency table: acceptance-rate category vs university type.
xtabs(~ Acceptance_70 + Private, data=clg)
##                    Private
## Acceptance_70       Private Public
##   Acceptance <= 70%       3     12
##   Acceptance > 70%       10      5
# Simple logistic regression: acceptance-rate category explained by
# university type alone ("Acceptance <= 70%" is the reference level).
logistic_simple <- glm(Acceptance_70 ~ Private, data=clg, family="binomial")
summary(logistic_simple)
## 
## Call:
## glm(formula = Acceptance_70 ~ Private, family = "binomial", data = clg)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.71251  -0.83463  -0.05513   0.72438   1.56447  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)  
## (Intercept)     1.2040     0.6583   1.829   0.0674 .
## PrivatePublic  -2.0794     0.8466  -2.456   0.0140 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 41.589  on 29  degrees of freedom
## Residual deviance: 34.642  on 28  degrees of freedom
## AIC: 38.642
## 
## Number of Fisher Scoring iterations: 4
predicted.data <- data.frame(probability.of.acpt=logistic_simple$fitted.values,UniType=clg$Private)
xtabs(~ probability.of.acpt + UniType, data=predicted.data)
##                    UniType
## probability.of.acpt Private Public
##   0.294117647059127       0     17
##   0.769230769230409      13      0
# Multivariable logistic regression: add enrollment, selectivity, faculty and
# graduation-rate covariates on top of university type.  NOTE(review): 10
# predictors on n = 30 observations is heavily over-parameterized -- the
# individual p-values below should be interpreted with caution.
logistic <- glm(Acceptance_70 ~ Private+Enroll+Top10perc+Top25perc+F.Undergrad+P.Undergrad+Terminal+S.F.Ratio+PhD+Grad.Rate, data=clg, family="binomial")
summary(logistic)
## 
## Call:
## glm(formula = Acceptance_70 ~ Private + Enroll + Top10perc + 
##     Top25perc + F.Undergrad + P.Undergrad + Terminal + S.F.Ratio + 
##     PhD + Grad.Rate, family = "binomial", data = clg)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.46267  -0.57981   0.00524   0.34716   2.33268  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)  
## (Intercept)   19.5120484  9.0770348   2.150   0.0316 *
## PrivatePublic -3.4850311  2.5468530  -1.368   0.1712  
## Enroll        -0.0006646  0.0023798  -0.279   0.7800  
## Top10perc     -0.1584332  0.1903560  -0.832   0.4052  
## Top25perc      0.0064126  0.1530075   0.042   0.9666  
## F.Undergrad    0.0003174  0.0004761   0.667   0.5051  
## P.Undergrad    0.0007401  0.0005138   1.440   0.1498  
## Terminal      -0.1619397  0.2116517  -0.765   0.4442  
## S.F.Ratio     -0.5750719  0.3089933  -1.861   0.0627 .
## PhD           -0.0389866  0.2206205  -0.177   0.8597  
## Grad.Rate      0.1590752  0.1380443   1.152   0.2492  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 41.589  on 29  degrees of freedom
## Residual deviance: 19.816  on 19  degrees of freedom
## AIC: 41.816
## 
## Number of Fisher Scoring iterations: 7
# Rank observations by fitted probability so the plot displays the model's
# ordering from least to most likely to have an acceptance rate above 70%.
predicted.data <- data.frame(probability.of.acpt=logistic$fitted.values,acpt=clg$Acceptance_70)
predicted.data <- predicted.data[order(predicted.data$probability.of.acpt, decreasing=FALSE),]
predicted.data$rank <- 1:nrow(predicted.data)
## Plot the predicted probability for each university, colored by whether
## its acceptance rate actually exceeded 70%.
ggplot(data=predicted.data, aes(x=rank, y=probability.of.acpt)) +
geom_point(aes(color=acpt), alpha=1, shape=4, stroke=2) +
xlab("Index") +
ylab("Predicted probability of acceptance > 70%")

# ROC curve for the multivariable logistic model.  (Axis-label typo
# "Postive" fixed; the Call: echoes below are the record of the prior run
# and will refresh on the next knit.)
roc(clg$Acceptance_70,logistic$fitted.values,plot=TRUE, legacy.axes=TRUE, xlab="False Positive Percentage", ylab="True Positive Percentage", col="#377eb8", lwd=4, percent=TRUE, print.auc=TRUE)
## Setting levels: control = Acceptance <= 70%, case = Acceptance > 70%
## Setting direction: controls < cases

## 
## Call:
## roc.default(response = clg$Acceptance_70, predictor = logistic$fitted.values,     percent = TRUE, plot = TRUE, legacy.axes = TRUE, xlab = "False Positive Percentage",     ylab = "True Postive Percentage", col = "#377eb8", lwd = 4,     print.auc = TRUE)
## 
## Data: logistic$fitted.values in 15 controls (clg$Acceptance_70 Acceptance <= 70%) < 15 cases (clg$Acceptance_70 Acceptance > 70%).
## Area under the curve: 92.89%
# ROC curve for the single-predictor model, for comparison.
roc(clg$Acceptance_70, logistic_simple$fitted.values, plot=TRUE, legacy.axes=TRUE, percent=TRUE, xlab="False Positive Percentage", ylab="True Positive Percentage", col="#377eb8", lwd=4, print.auc=TRUE)
## Setting levels: control = Acceptance <= 70%, case = Acceptance > 70%
## Setting direction: controls < cases
## 
## Call:
## roc.default(response = clg$Acceptance_70, predictor = logistic_simple$fitted.values,     percent = TRUE, plot = TRUE, legacy.axes = TRUE, xlab = "False Positive Percentage",     ylab = "True Postive Percentage", col = "#377eb8", lwd = 4,     print.auc = TRUE)
## 
## Data: logistic_simple$fitted.values in 15 controls (clg$Acceptance_70 Acceptance <= 70%) < 15 cases (clg$Acceptance_70 Acceptance > 70%).
## Area under the curve: 73.33%
# Overlay the multivariable model's curve (green) on the simple model's
# plot (blue) so the two AUCs can be compared directly.
plot.roc(clg$Acceptance_70, logistic$fitted.values, percent=TRUE, col="#4daf4a", lwd=4, print.auc=TRUE, add=TRUE, print.auc.y=40)
## Setting levels: control = Acceptance <= 70%, case = Acceptance > 70%
## Setting direction: controls < cases
legend("bottomright", legend=c("Simple", "Non Simple"), col=c("#377eb8", "#4daf4a"), lwd=4) 

LDA (Linear Discriminant Analysis)

# Reload the cleaned dataset with the first column as row names (university
# names), so all numeric columns can be used as an LDA feature matrix.
clg1 <- read.csv("/Users/ajayvishnu/Desktop/RUTGERS/Spring_2023/Multivariate Analysis/Datasets/College_Data_cleaned.csv", row.names = 1)

# NOTE(review): attach() is discouraged -- it masks same-named columns from
# the earlier `clg` attach (see output below) and makes name resolution
# order-dependent; explicit clg1$ references would be safer.
attach(clg1)
## The following objects are masked from clg:
## 
##     Accept, Acceptance_70, Apps, Books, Enroll, Expend, F.Undergrad,
##     Grad.Rate, Outstate, P.Undergrad, perc.alumni, Personal, PhD,
##     Private, Room.Board, S.F.Ratio, Terminal, Top10perc, Top25perc
# Recode the 0/1 Private indicator to readable labels.  The first assignment
# coerces the whole column to character, so the second comparison (== 1)
# matches the remaining "1" strings.
clg1[clg1$Private == 0,]$Private <- "Public"
clg1[clg1$Private == 1,]$Private <- "Private"
# Numeric feature matrix (columns 2-19), with the class label re-appended as
# a 0/1 column: Private -> 0, Public -> 1 (alphabetical factor order).
clg1.data <- as.matrix(clg1[,c(2:19)])
clg1_raw <- cbind(clg1.data, as.numeric(as.factor(clg1$Private))-1)
colnames(clg1_raw)[19] <- "UniType"
# 75/25 train/test split.  NOTE(review): no set.seed() call is visible here,
# so the split -- and all downstream results -- are not reproducible; confirm
# whether a seed is set in an earlier chunk.
smp_size_raw <- floor(0.75 * nrow(clg1_raw))
train_ind_raw <- sample(nrow(clg1_raw), size = smp_size_raw)
train_raw.df <- as.data.frame(clg1_raw[train_ind_raw, ])
test_raw.df <- as.data.frame(clg1_raw[-train_ind_raw, ])
# Fit LDA with UniType as the response.  BUG FIX: the original wrote the
# response as `train_raw.df$UniType`; because that is a call rather than a
# bare column name, the `.` on the right-hand side expands to EVERY column
# of `data` -- including UniType itself -- so the class label leaked into
# the predictors and the apparent separation was inflated.  With the bare
# name, `.` correctly excludes the response.
clg1_raw.lda <- lda(UniType ~ ., data = train_raw.df)
# Distribution of the first discriminant scores, one panel per class.
plot(clg1_raw.lda, dimen = 1, type = "b")

# Score the held-out test set with the fitted LDA model.
clg1_raw.lda.predict <- predict(clg1_raw.lda, newdata = test_raw.df)

# Posterior class probabilities; column 2 corresponds to UniType == 1
# (i.e. Public, given the Private=0/Public=1 coding above).
clg1_raw.lda.predict.posteriors <- as.data.frame(clg1_raw.lda.predict$posterior)

# ROCR pipeline: build a prediction object from the posterior for the
# positive class against the true test labels, then extract the ROC curve
# (TPR vs FPR) and the AUC.
pred <- prediction(clg1_raw.lda.predict.posteriors[,2], test_raw.df$UniType)
roc.perf = performance(pred, measure = "tpr", x.measure = "fpr")
auc.train <- performance(pred, measure = "auc")
# performance() returns an S4 object; the AUC value lives in the y.values slot.
auc.train <- auc.train@y.values
plot(roc.perf)
# Diagonal reference line = performance of a random classifier.
abline(a=0, b= 1)
text(x = .25, y = .65 ,paste("AUC = ", round(auc.train[[1]],3), sep = ""))